library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
library(stringr)
library(imputeTS)
library(DataExplorer)
library(cbanalysis)
library(gvlma)
require(knitr)
## Loading required package: knitr
require(lubridate)
## Loading required package: lubridate
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:plyr':
## 
##     here
## The following object is masked from 'package:base':
## 
##     date
moneyball <- read.csv("https://raw.githubusercontent.com/xkong100/data-621/master/Hw1/moneyball-training-data.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = c("", "NA"))
kable(head(moneyball))
INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
1 39 1445 194 39 13 143 842 NA NA NA 9364 84 927 5456 1011 NA
2 70 1339 219 22 190 685 1075 37 28 NA 1347 191 689 1082 193 155
3 86 1377 232 35 137 602 917 46 27 NA 1377 137 602 917 175 153
4 70 1387 209 38 96 451 922 43 30 NA 1396 97 454 928 164 156
5 82 1297 186 27 102 472 920 49 39 NA 1297 102 472 920 138 168
6 75 1279 200 36 92 443 973 107 59 NA 1279 92 443 973 123 149
nrow(moneyball)
## [1] 2276
ncol(moneyball)
## [1] 17
cleanNames <- function(df) {
    name_list <- names(df)
    name_list <- gsub("TEAM_", "", name_list)
    names(df) <- name_list
    df
}

moneyball <- cleanNames(moneyball)
kable(head(moneyball))
INDEX TARGET_WINS BATTING_H BATTING_2B BATTING_3B BATTING_HR BATTING_BB BATTING_SO BASERUN_SB BASERUN_CS BATTING_HBP PITCHING_H PITCHING_HR PITCHING_BB PITCHING_SO FIELDING_E FIELDING_DP
1 39 1445 194 39 13 143 842 NA NA NA 9364 84 927 5456 1011 NA
2 70 1339 219 22 190 685 1075 37 28 NA 1347 191 689 1082 193 155
3 86 1377 232 35 137 602 917 46 27 NA 1377 137 602 917 175 153
4 70 1387 209 38 96 451 922 43 30 NA 1396 97 454 928 164 156
5 82 1297 186 27 102 472 920 49 39 NA 1297 102 472 920 138 168
6 75 1279 200 36 92 443 973 107 59 NA 1279 92 443 973 123 149
There ar e 2276 rows an d 16 columns .
summary(moneyball)
##      INDEX         TARGET_WINS       BATTING_H      BATTING_2B   
##  Min.   :   1.0   Min.   :  0.00   Min.   : 891   Min.   : 69.0  
##  1st Qu.: 630.8   1st Qu.: 71.00   1st Qu.:1383   1st Qu.:208.0  
##  Median :1270.5   Median : 82.00   Median :1454   Median :238.0  
##  Mean   :1268.5   Mean   : 80.79   Mean   :1469   Mean   :241.2  
##  3rd Qu.:1915.5   3rd Qu.: 92.00   3rd Qu.:1537   3rd Qu.:273.0  
##  Max.   :2535.0   Max.   :146.00   Max.   :2554   Max.   :458.0  
##                                                                  
##    BATTING_3B       BATTING_HR       BATTING_BB      BATTING_SO    
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.0   Min.   :   0.0  
##  1st Qu.: 34.00   1st Qu.: 42.00   1st Qu.:451.0   1st Qu.: 548.0  
##  Median : 47.00   Median :102.00   Median :512.0   Median : 750.0  
##  Mean   : 55.25   Mean   : 99.61   Mean   :501.6   Mean   : 735.6  
##  3rd Qu.: 72.00   3rd Qu.:147.00   3rd Qu.:580.0   3rd Qu.: 930.0  
##  Max.   :223.00   Max.   :264.00   Max.   :878.0   Max.   :1399.0  
##                                                    NA's   :102     
##    BASERUN_SB      BASERUN_CS     BATTING_HBP      PITCHING_H   
##  Min.   :  0.0   Min.   :  0.0   Min.   :29.00   Min.   : 1137  
##  1st Qu.: 66.0   1st Qu.: 38.0   1st Qu.:50.50   1st Qu.: 1419  
##  Median :101.0   Median : 49.0   Median :58.00   Median : 1518  
##  Mean   :124.8   Mean   : 52.8   Mean   :59.36   Mean   : 1779  
##  3rd Qu.:156.0   3rd Qu.: 62.0   3rd Qu.:67.00   3rd Qu.: 1682  
##  Max.   :697.0   Max.   :201.0   Max.   :95.00   Max.   :30132  
##  NA's   :131     NA's   :772     NA's   :2085                   
##   PITCHING_HR     PITCHING_BB      PITCHING_SO        FIELDING_E    
##  Min.   :  0.0   Min.   :   0.0   Min.   :    0.0   Min.   :  65.0  
##  1st Qu.: 50.0   1st Qu.: 476.0   1st Qu.:  615.0   1st Qu.: 127.0  
##  Median :107.0   Median : 536.5   Median :  813.5   Median : 159.0  
##  Mean   :105.7   Mean   : 553.0   Mean   :  817.7   Mean   : 246.5  
##  3rd Qu.:150.0   3rd Qu.: 611.0   3rd Qu.:  968.0   3rd Qu.: 249.2  
##  Max.   :343.0   Max.   :3645.0   Max.   :19278.0   Max.   :1898.0  
##                                   NA's   :102                       
##   FIELDING_DP   
##  Min.   : 52.0  
##  1st Qu.:131.0  
##  Median :149.0  
##  Mean   :146.4  
##  3rd Qu.:164.0  
##  Max.   :228.0  
##  NA's   :286
attach(moneyball)
sd(TARGET_WINS)
## [1] 15.75215
sd(BATTING_H)
## [1] 144.5912
sd(BATTING_2B)
## [1] 46.80141
sd(BATTING_3B)
## [1] 27.93856
sd(BATTING_HR)
## [1] 60.54687
sd(BATTING_BB)
## [1] 122.6709
sd(BATTING_SO,na.rm = TRUE)
## [1] 248.5264
sd(BASERUN_SB,na.rm = TRUE)
## [1] 87.79117
sd(BASERUN_CS,na.rm = TRUE)
## [1] 22.95634
sd(BATTING_HBP,na.rm = TRUE)
## [1] 12.96712
sd(PITCHING_H)
## [1] 1406.843
sd(PITCHING_HR)
## [1] 61.29875
sd(PITCHING_BB)
## [1] 166.3574
sd(PITCHING_SO,na.rm = TRUE)
## [1] 553.085
sd(FIELDING_E)
## [1] 227.771
sd(FIELDING_DP, na.rm =TRUE)
## [1] 26.22639
boxplot(TARGET_WINS)

boxplot(BATTING_H)

boxplot(BATTING_2B)

boxplot(BATTING_3B)

boxplot(BATTING_HR)

boxplot(BATTING_BB)

boxplot(BATTING_SO,na.rm = TRUE)

boxplot(BASERUN_SB,na.rm = TRUE)

boxplot(BASERUN_CS,na.rm = TRUE)

boxplot(BATTING_HBP,na.rm = TRUE)

boxplot(PITCHING_H)

boxplot(PITCHING_HR)

boxplot(PITCHING_BB)

boxplot(PITCHING_SO,na.rm = TRUE)

boxplot(FIELDING_E)

boxplot(FIELDING_DP, na.rm =TRUE)

plot_histogram(moneyball)

plot_missing(moneyball)

moneyball[is.na(moneyball$BATTING_SO),"BATTING_SO"] <- median(na.omit(moneyball$BATTING_SO))
moneyball[is.na(moneyball$PITCHING_SO),"PITCHING_SO"] <- median(na.omit(moneyball$PITCHING_SO))
moneyball[is.na(moneyball$BASERUN_SB),"BASERUN_SB"] <- median(na.omit(moneyball$BASERUN_SB))
moneyball[is.na(moneyball$FIELDING_DP),"FIELDING_DP"] <- median(na.omit(moneyball$FIELDING_DP))
moneyball[is.na(moneyball$BASERUN_CS),"BASERUN_CS"] <- median(na.omit(moneyball$BASERUN_CS))
plot_missing(moneyball)

# By observing the data, I see that TEAM_BATTING_H is the sum of 1B, 2B, 3B, HR, to find 1B and analyze them individually, we can find 1B.
attach(moneyball)
## The following objects are masked from moneyball (pos = 3):
## 
##     BASERUN_CS, BASERUN_SB, BATTING_2B, BATTING_3B, BATTING_BB,
##     BATTING_H, BATTING_HBP, BATTING_HR, BATTING_SO, FIELDING_DP,
##     FIELDING_E, INDEX, PITCHING_BB, PITCHING_H, PITCHING_HR,
##     PITCHING_SO, TARGET_WINS
moneyball<- moneyball %>% mutate(BATTING_1B=BATTING_H-BATTING_2B-BATTING_3B,Total_batting=1*BATTING_1B+2*BATTING_2B+3*BATTING_3B+4*BATTING_HR) %>% dplyr ::select(-BATTING_H,-BATTING_HBP,-INDEX)
kable(head(moneyball))
TARGET_WINS BATTING_2B BATTING_3B BATTING_HR BATTING_BB BATTING_SO BASERUN_SB BASERUN_CS PITCHING_H PITCHING_HR PITCHING_BB PITCHING_SO FIELDING_E FIELDING_DP BATTING_1B Total_batting
39 194 39 13 143 842 101 49 9364 84 927 5456 1011 149 1212 1769
70 219 22 190 685 1075 37 28 1347 191 689 1082 193 155 1098 2362
86 232 35 137 602 917 46 27 1377 137 602 917 175 153 1110 2227
70 209 38 96 451 922 43 30 1396 97 454 928 164 156 1140 2056
82 186 27 102 472 920 49 39 1297 102 472 920 138 168 1084 1945
75 200 36 92 443 973 107 59 1279 92 443 973 123 149 1043 1919
cor(moneyball)
##               TARGET_WINS  BATTING_2B   BATTING_3B BATTING_HR  BATTING_BB
## TARGET_WINS    1.00000000  0.28910365  0.142608411  0.1761532  0.23255986
## BATTING_2B     0.28910365  1.00000000 -0.107305824  0.4353973  0.25572610
## BATTING_3B     0.14260841 -0.10730582  1.000000000 -0.6355669 -0.28723584
## BATTING_HR     0.17615320  0.43539729 -0.635566946  1.0000000  0.51373481
## BATTING_BB     0.23255986  0.25572610 -0.287235841  0.5137348  1.00000000
## BATTING_SO    -0.03058135  0.15173438 -0.655709613  0.6930076  0.37148892
## BASERUN_SB     0.12361087 -0.18340432  0.485740156 -0.4068891 -0.04268402
## BASERUN_CS     0.01595982 -0.04584955  0.136181182 -0.2254587 -0.04581766
## PITCHING_H    -0.10993705  0.02369219  0.194879411 -0.2501455 -0.44977762
## PITCHING_HR    0.18901373  0.45455082 -0.567836679  0.9693714  0.45955207
## PITCHING_BB    0.12417454  0.17805420 -0.002224148  0.1369276  0.48936126
## PITCHING_SO   -0.07579967  0.06213042 -0.254238104  0.1774182 -0.02017989
## FIELDING_E    -0.17648476 -0.23515099  0.509778447 -0.5873391 -0.65597081
## FIELDING_DP   -0.03008630  0.25696798 -0.227771884  0.3916524  0.32963974
## BATTING_1B     0.34579395  0.33580405  0.347822719 -0.0318712 -0.12886347
## Total_batting  0.39892151  0.75439415 -0.136638042  0.7493183  0.36408258
##                BATTING_SO  BASERUN_SB  BASERUN_CS  PITCHING_H PITCHING_HR
## TARGET_WINS   -0.03058135  0.12361087  0.01595982 -0.10993705  0.18901373
## BATTING_2B     0.15173438 -0.18340432 -0.04584955  0.02369219  0.45455082
## BATTING_3B    -0.65570961  0.48574016  0.13618118  0.19487941 -0.56783668
## BATTING_HR     0.69300765 -0.40688907 -0.22545867 -0.25014548  0.96937140
## BATTING_BB     0.37148892 -0.04268402 -0.04581766 -0.44977762  0.45955207
## BATTING_SO     1.00000000 -0.21178758 -0.10250193 -0.37571553  0.63286033
## BASERUN_SB    -0.21178758  1.00000000  0.23324171  0.03957227 -0.38005624
## BASERUN_CS    -0.10250193  0.23324171  1.00000000 -0.05259183 -0.22818525
## PITCHING_H    -0.37571553  0.03957227 -0.05259183  1.00000000 -0.14161276
## PITCHING_HR    0.63286033 -0.38005624 -0.22818525 -0.14161276  1.00000000
## PITCHING_BB    0.03498809  0.12928969 -0.04722893  0.32067616  0.22193750
## PITCHING_SO    0.41618159 -0.06424741 -0.05653800  0.26693587  0.19691491
## FIELDING_E    -0.58259305  0.32615276 -0.02917821  0.66775901 -0.49314447
## FIELDING_DP    0.11089804 -0.27023400 -0.10200214 -0.04464784  0.38959550
## BATTING_1B    -0.48464372  0.09474682 -0.01375594  0.33253091  0.04579447
## Total_batting  0.24141254 -0.21340675 -0.16245457 -0.01596413  0.77829405
##                PITCHING_BB  PITCHING_SO  FIELDING_E  FIELDING_DP
## TARGET_WINS    0.124174536 -0.075799674 -0.17648476 -0.030086302
## BATTING_2B     0.178054204  0.062130422 -0.23515099  0.256967975
## BATTING_3B    -0.002224148 -0.254238104  0.50977845 -0.227771884
## BATTING_HR     0.136927564  0.177418187 -0.58733910  0.391652434
## BATTING_BB     0.489361263 -0.020179893 -0.65597081  0.329639737
## BATTING_SO     0.034988093  0.416181592 -0.58259305  0.110898035
## BASERUN_SB     0.129289686 -0.064247407  0.32615276 -0.270234003
## BASERUN_CS    -0.047228927 -0.056538002 -0.02917821 -0.102002137
## PITCHING_H     0.320676162  0.266935871  0.66775901 -0.044647837
## PITCHING_HR    0.221937505  0.196914911 -0.49314447  0.389595503
## PITCHING_BB    1.000000000  0.482172000 -0.02283756  0.192348657
## PITCHING_SO    0.482172000  1.000000000 -0.02332278  0.009552324
## FIELDING_E    -0.022837561 -0.023322782  1.00000000 -0.227394807
## FIELDING_DP    0.192348657  0.009552324 -0.22739481  1.000000000
## BATTING_1B     0.047792487 -0.279280625  0.31333793  0.110655544
## Total_batting  0.182240384 -0.012524535 -0.28654467  0.371830404
##                BATTING_1B Total_batting
## TARGET_WINS    0.34579395    0.39892151
## BATTING_2B     0.33580405    0.75439415
## BATTING_3B     0.34782272   -0.13663804
## BATTING_HR    -0.03187120    0.74931833
## BATTING_BB    -0.12886347    0.36408258
## BATTING_SO    -0.48464372    0.24141254
## BASERUN_SB     0.09474682   -0.21340675
## BASERUN_CS    -0.01375594   -0.16245457
## PITCHING_H     0.33253091   -0.01596413
## PITCHING_HR    0.04579447    0.77829405
## PITCHING_BB    0.04779249    0.18224038
## PITCHING_SO   -0.27928062   -0.01252454
## FIELDING_E     0.31333793   -0.28654467
## FIELDING_DP    0.11065554    0.37183040
## BATTING_1B     1.00000000    0.54607257
## Total_batting  0.54607257    1.00000000

Model 1: Backwards Selection

m1 <-lm(TARGET_WINS~.,data=moneyball)
plot(m1)

summary(m1)
## 
## Call:
## lm(formula = TARGET_WINS ~ ., data = moneyball)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -49.753  -8.626   0.120   8.395  58.561 
## 
## Coefficients: (1 not defined because of singularities)
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   23.6421579  5.3902272   4.386 1.21e-05 ***
## BATTING_2B     0.0279578  0.0073363   3.811 0.000142 ***
## BATTING_3B     0.1133940  0.0159335   7.117 1.48e-12 ***
## BATTING_HR     0.0527325  0.0274915   1.918 0.055219 .  
## BATTING_BB     0.0104483  0.0058377   1.790 0.073621 .  
## BATTING_SO    -0.0084323  0.0025461  -3.312 0.000941 ***
## BASERUN_SB     0.0254236  0.0043565   5.836 6.12e-09 ***
## BASERUN_CS    -0.0110027  0.0157842  -0.697 0.485829    
## PITCHING_H    -0.0008456  0.0003674  -2.302 0.021444 *  
## PITCHING_HR    0.0129626  0.0243894   0.531 0.595135    
## PITCHING_BB    0.0007798  0.0041571   0.188 0.851231    
## PITCHING_SO    0.0028156  0.0009219   3.054 0.002284 ** 
## FIELDING_E    -0.0195325  0.0024609  -7.937 3.23e-15 ***
## FIELDING_DP   -0.1217801  0.0129421  -9.410  < 2e-16 ***
## BATTING_1B     0.0489152  0.0036949  13.239  < 2e-16 ***
## Total_batting         NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.07 on 2261 degrees of freedom
## Multiple R-squared:  0.3154, Adjusted R-squared:  0.3111 
## F-statistic:  74.4 on 14 and 2261 DF,  p-value: < 2.2e-16

Model 2 Get rid of “Total_Batting” for now

m2 <- update(m1,~.-Total_batting)
plot(m2)

summary(m2)
## 
## Call:
## lm(formula = TARGET_WINS ~ BATTING_2B + BATTING_3B + BATTING_HR + 
##     BATTING_BB + BATTING_SO + BASERUN_SB + BASERUN_CS + PITCHING_H + 
##     PITCHING_HR + PITCHING_BB + PITCHING_SO + FIELDING_E + FIELDING_DP + 
##     BATTING_1B, data = moneyball)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -49.753  -8.626   0.120   8.395  58.561 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 23.6421579  5.3902272   4.386 1.21e-05 ***
## BATTING_2B   0.0279578  0.0073363   3.811 0.000142 ***
## BATTING_3B   0.1133940  0.0159335   7.117 1.48e-12 ***
## BATTING_HR   0.0527325  0.0274915   1.918 0.055219 .  
## BATTING_BB   0.0104483  0.0058377   1.790 0.073621 .  
## BATTING_SO  -0.0084323  0.0025461  -3.312 0.000941 ***
## BASERUN_SB   0.0254236  0.0043565   5.836 6.12e-09 ***
## BASERUN_CS  -0.0110027  0.0157842  -0.697 0.485829    
## PITCHING_H  -0.0008456  0.0003674  -2.302 0.021444 *  
## PITCHING_HR  0.0129626  0.0243894   0.531 0.595135    
## PITCHING_BB  0.0007798  0.0041571   0.188 0.851231    
## PITCHING_SO  0.0028156  0.0009219   3.054 0.002284 ** 
## FIELDING_E  -0.0195325  0.0024609  -7.937 3.23e-15 ***
## FIELDING_DP -0.1217801  0.0129421  -9.410  < 2e-16 ***
## BATTING_1B   0.0489152  0.0036949  13.239  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.07 on 2261 degrees of freedom
## Multiple R-squared:  0.3154, Adjusted R-squared:  0.3111 
## F-statistic:  74.4 on 14 and 2261 DF,  p-value: < 2.2e-16

Model 3: get rid of variables are not statistically significant.

m3<-update(m1,~.-BATTING_HR-BATTING_BB-BASERUN_CS-PITCHING_HR-PITCHING_BB)
plot(m3)

summary(m3)
## 
## Call:
## lm(formula = TARGET_WINS ~ BATTING_2B + BATTING_3B + BATTING_SO + 
##     BASERUN_SB + PITCHING_H + PITCHING_SO + FIELDING_E + FIELDING_DP + 
##     BATTING_1B + Total_batting, data = moneyball)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -51.028  -8.645   0.079   8.538  58.554 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   28.5417286  4.9219496   5.799 7.61e-09 ***
## BATTING_2B    -0.0094319  0.0094685  -0.996 0.319289    
## BATTING_3B     0.0605133  0.0156578   3.865 0.000114 ***
## BATTING_SO    -0.0099177  0.0024244  -4.091 4.45e-05 ***
## BASERUN_SB     0.0298779  0.0039613   7.542 6.64e-14 ***
## PITCHING_H    -0.0008022  0.0003216  -2.494 0.012699 *  
## PITCHING_SO    0.0029552  0.0006735   4.388 1.20e-05 ***
## FIELDING_E    -0.0226345  0.0021496 -10.530  < 2e-16 ***
## FIELDING_DP   -0.1125269  0.0126728  -8.879  < 2e-16 ***
## BATTING_1B     0.0284034  0.0050326   5.644 1.87e-08 ***
## Total_batting  0.0191872  0.0023145   8.290  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.1 on 2265 degrees of freedom
## Multiple R-squared:  0.3115, Adjusted R-squared:  0.3085 
## F-statistic: 102.5 on 10 and 2265 DF,  p-value: < 2.2e-16

Model4: Keep getting rid of the insignificant variable.

m4 <-update(m3,~.-BATTING_2B-PITCHING_H)
plot(m4)

summary(m4)
## 
## Call:
## lm(formula = TARGET_WINS ~ BATTING_3B + BATTING_SO + BASERUN_SB + 
##     PITCHING_SO + FIELDING_E + FIELDING_DP + BATTING_1B + Total_batting, 
##     data = moneyball)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -52.140  -8.690   0.037   8.443  59.016 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   27.996811   4.798551   5.834 6.17e-09 ***
## BATTING_3B     0.069968   0.015282   4.578 4.94e-06 ***
## BATTING_SO    -0.007831   0.002273  -3.446 0.000580 ***
## BASERUN_SB     0.030903   0.003936   7.851 6.33e-15 ***
## PITCHING_SO    0.002079   0.000591   3.517 0.000444 ***
## FIELDING_E    -0.025823   0.001712 -15.081  < 2e-16 ***
## FIELDING_DP   -0.111019   0.012628  -8.792  < 2e-16 ***
## BATTING_1B     0.028995   0.004849   5.979 2.60e-09 ***
## Total_batting  0.017044   0.001734   9.830  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.11 on 2267 degrees of freedom
## Multiple R-squared:  0.3093, Adjusted R-squared:  0.3068 
## F-statistic: 126.9 on 8 and 2267 DF,  p-value: < 2.2e-16

Evaluation by using our model

evaluation <- read.csv("https://raw.githubusercontent.com/xkong100/data-621/master/Hw1/moneyball-evaluation-data.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = c("", "NA"))
kable(head(evaluation))
INDEX TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
9 1209 170 33 83 447 1080 62 50 NA 1209 83 447 1080 140 156
10 1221 151 29 88 516 929 54 39 NA 1221 88 516 929 135 164
14 1395 183 29 93 509 816 59 47 NA 1395 93 509 816 156 153
47 1539 309 29 159 486 914 148 57 42 1539 159 486 914 124 154
60 1445 203 68 5 95 416 NA NA NA 3902 14 257 1123 616 130
63 1431 236 53 10 215 377 NA NA NA 2793 20 420 736 572 105
evaluation <- cleanNames(evaluation)
kable(head(evaluation))
INDEX BATTING_H BATTING_2B BATTING_3B BATTING_HR BATTING_BB BATTING_SO BASERUN_SB BASERUN_CS BATTING_HBP PITCHING_H PITCHING_HR PITCHING_BB PITCHING_SO FIELDING_E FIELDING_DP
9 1209 170 33 83 447 1080 62 50 NA 1209 83 447 1080 140 156
10 1221 151 29 88 516 929 54 39 NA 1221 88 516 929 135 164
14 1395 183 29 93 509 816 59 47 NA 1395 93 509 816 156 153
47 1539 309 29 159 486 914 148 57 42 1539 159 486 914 124 154
60 1445 203 68 5 95 416 NA NA NA 3902 14 257 1123 616 130
63 1431 236 53 10 215 377 NA NA NA 2793 20 420 736 572 105
plot_missing(evaluation)

evaluation[is.na(evaluation$BATTING_SO),"BATTING_SO"] <- median(na.omit(evaluation$BATTING_SO))
evaluation[is.na(evaluation$PITCHING_SO),"PITCHING_SO"] <- median(na.omit(evaluation$PITCHING_SO))
evaluation[is.na(evaluation$BASERUN_SB),"BASERUN_SB"] <- median(na.omit(evaluation$BASERUN_SB))
evaluation[is.na(evaluation$FIELDING_DP),"FIELDING_DP"] <- median(na.omit(evaluation$FIELDING_DP))
evaluation[is.na(evaluation$BASERUN_CS),"BASERUN_CS"] <- median(na.omit(evaluation$BASERUN_CS))
plot_missing(evaluation)

attach(evaluation)
## The following objects are masked from moneyball (pos = 3):
## 
##     BASERUN_CS, BASERUN_SB, BATTING_2B, BATTING_3B, BATTING_BB,
##     BATTING_H, BATTING_HBP, BATTING_HR, BATTING_SO, FIELDING_DP,
##     FIELDING_E, INDEX, PITCHING_BB, PITCHING_H, PITCHING_HR,
##     PITCHING_SO
## The following objects are masked from moneyball (pos = 4):
## 
##     BASERUN_CS, BASERUN_SB, BATTING_2B, BATTING_3B, BATTING_BB,
##     BATTING_H, BATTING_HBP, BATTING_HR, BATTING_SO, FIELDING_DP,
##     FIELDING_E, INDEX, PITCHING_BB, PITCHING_H, PITCHING_HR,
##     PITCHING_SO
evaluation<- evaluation %>% mutate(BATTING_1B=BATTING_H-BATTING_2B-BATTING_3B,Total_batting=1*BATTING_1B+2*BATTING_2B+3*BATTING_3B+4*BATTING_HR) %>% dplyr ::select(-BATTING_H,-BATTING_HBP,-INDEX)
kable(head(evaluation))
BATTING_2B BATTING_3B BATTING_HR BATTING_BB BATTING_SO BASERUN_SB BASERUN_CS PITCHING_H PITCHING_HR PITCHING_BB PITCHING_SO FIELDING_E FIELDING_DP BATTING_1B Total_batting
170 33 83 447 1080 62 50.0 1209 83 447 1080 140 156 1006 1777
151 29 88 516 929 54 39.0 1221 88 516 929 135 164 1041 1782
183 29 93 509 816 59 47.0 1395 93 509 816 156 153 1183 2008
309 29 159 486 914 148 57.0 1539 159 486 914 124 154 1201 2542
203 68 5 95 416 92 49.5 3902 14 257 1123 616 130 1174 1804
236 53 10 215 377 92 49.5 2793 20 420 736 572 105 1142 1813
pred <- predict(m1, evaluation,type='response')
## Warning in predict.lm(m1, evaluation, type = "response"): prediction from a
## rank-deficient fit may be misleading
final <- data.frame(cbind(pred, moneyball$TARGET_WINS))
## Warning in cbind(pred, moneyball$TARGET_WINS): number of rows of result is
## not a multiple of vector length (arg 1)
kable(head(final))
pred V2
64.02285 39
65.73235 70
75.27890 86
85.74341 70
66.37855 82
69.83817 75